#importing required libraries and setting matplotlib to inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from statsmodels.formula.api import ols
%matplotlib inline
#grabbing the dataset
df = pd.read_csv('kc_house_data.csv')
#dropping id, date and location columns that will not be used as predictors
df.drop(['id', 'date', 'lat', 'long', 'zipcode'], axis=1, inplace=True)
#viewing the first 15 rows
df.head(15)
#checking values in sqft_basement (an object column containing '?' placeholders)
df.sqft_basement.value_counts()
#replacing missing ('?') entries in sqft_basement with the mean of the numeric values
sqft_basement_numerical = df.sqft_basement[df.sqft_basement != '?']
sqft_basement_numerical = sqft_basement_numerical.astype(float)
sqft_basement_numerical_mean = round(sqft_basement_numerical.mean(),1)
df['sqft_basement'] = (df['sqft_basement'].map(lambda x: sqft_basement_numerical_mean if x == '?' else x)).astype(float)
#checking for null values
df.isna().sum()
#replacing null values in view, waterfront and yr_renovated with 0
df['view'] = df['view'].fillna(0)
df['waterfront'] = df['waterfront'].fillna(0)
df['yr_renovated'] = df['yr_renovated'].fillna(0)
#creating copy of df before transformations (used later as an untransformed baseline)
df_original = df.copy()
#initial histogram of all variables
df.hist(figsize=(15,15))
#checking bedrooms for outliers
df.bedrooms.value_counts()
#histogram before removal of outliers
df.bedrooms.hist()
#removing outliers (homes with 8 or more bedrooms)
df.drop(df.index[df['bedrooms'] >= 8], inplace=True)
#histogram after removal of outliers
df.bedrooms.hist()
#bathrooms histogram
df.bathrooms.hist()
#previewing a log transformation on bathrooms to reduce right skew
(np.log(df.bathrooms)).hist()
#transforming bathrooms
# NOTE(review): np.log yields -inf for any 0-bathroom rows -- confirm none exist in this dataset
df.bathrooms = np.log(df.bathrooms)
#sqft_living histogram
df.sqft_living.hist()
#checking count of outliers at several candidate cutoffs
print(len(df[df.sqft_living > 8000]))
print(len(df[df.sqft_living > 6000]))
print(len(df[df.sqft_living > 4000]))
#removing outliers above the chosen 4000 sqft cutoff
df.drop(df.index[df['sqft_living'] > 4000], inplace=True)
#histogram after removal of outliers
df.sqft_living.hist()
#previewing a square root transformation of sqft_living
(np.sqrt(df.sqft_living)).hist()
#square root transformation on sqft_living
df.sqft_living = np.sqrt(df.sqft_living)
#sqft_above histogram
df.sqft_above.hist()
#square root transformation
df.sqft_above = np.sqrt(df.sqft_above)
#square root transformed histogram
df.sqft_above.hist()
#sqft_basement histogram
df.sqft_basement.hist()
#creating new column that declares whether or not a home has a basement (1 = has basement)
# vectorized comparison instead of the previous row-wise df.apply -- identical 0/1 ints, much faster
df['basement'] = (df.sqft_basement != 0).astype(int)
#new binary basement histogram
df.basement.hist()
#viewing stats of yr_built
df.yr_built.describe()
#creating new column age: years since construction relative to the newest home (+1 so min age is 1)
df['age'] = (df.yr_built.max() - df.yr_built + 1)
#histogram of new age column
df.age.hist()
#square root transformation on age to reduce skew
df.age = np.sqrt(df.age)
df.age.hist()
#removal of old yr_built column, now superseded by age
df.drop('yr_built', axis=1, inplace=True)
#sqft_lot histogram
df.sqft_lot.hist()
#checking number of outliers above 200,000 sqft
len(df[df.sqft_lot>200000])
#removal of outliers
df.drop(df.index[df['sqft_lot'] > 200000], inplace=True)
#histogram after removal of outliers
df.sqft_lot.hist()
#log transformation of sqft_lot to reduce right skew
df.sqft_lot = np.log(df.sqft_lot)
df.sqft_lot.hist()
#histogram of sqft_lot15 (lot size of the 15 nearest neighbors)
df.sqft_lot15.hist()
#log transformed sqft_lot15
df.sqft_lot15 = np.log(df.sqft_lot15)
df.sqft_lot15.hist()
#view histogram
df.view.hist()
#unique values of view (0.0 through 4.0)
df.view.unique()
#converting all view values above 0 to 1 to signify that the home has been viewed
# single vectorized pass; replaces the old while-loop that called .replace() once per value 1..4
# (float dtype kept to match the original replace() result)
df['viewed'] = (df.view > 0).astype(float)
df.viewed.hist()
#price histogram
df.price.hist()
#log (natural) transformation on price
df['price_log'] = np.log(df.price)
df.price_log.hist()
#updated histogram of all variables after transformations and adjustments
df.hist(figsize=(14,14))
#stepwise selection function
import statsmodels.api as sm

def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Perform a forward-backward (stepwise) feature selection
    based on p-values from statsmodels.api.OLS.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
                       defaults to an empty list
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features

    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details.
    """
    # None sentinel avoids the mutable-default-argument pitfall
    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False
        # forward step: fit one model per excluded feature, add the most significant one
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when no candidates remain -> comparison below is False
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
        # backward step: drop the least significant included feature if above threshold_out
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty -> comparison below is False
        if worst_pval > threshold_out:
            changed = True
            # idxmax (not argmax): we need the feature LABEL to remove from `included`;
            # Series.argmax returns a positional int in modern pandas
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
#RFE regression of all predictors using repeated k-fold (transformed df, raw price target)
#setting X, y and creating train/test split
X = df.drop(['price', 'price_log'], axis=1)
y = df.price  # NOTE(review): this run targets raw price, unlike later runs that use price_log -- confirm intended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column in 'df'
while i <= (int(len(df.drop(['price', 'price_log'], axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        # RFE selects the best i features with a fresh linear model each fold
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scoring and MSEs below use the outer hold-out set (X_test/y_test),
        # not this fold's validation split (X_val/y_val) -- confirm this is intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats of the k-fold above)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #creating a plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df['# Predictors'] = predictors
results_df['R-squared'] = reg_score
results_df['MSE % Difference'] = mse_diffs
results_df['Added Predictor'] = added_pred
plt.show()
results_df
#RFE regression of all predictors using repeated k-fold (untransformed df_original baseline)
#setting X, y and creating train/test split
X = df_original.drop(['price'], axis=1)
y = df_original.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df_original = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column
while i <= (int(len(df_original.drop('price', axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scores/MSEs use the outer hold-out set, not X_val/y_val -- confirm intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df_original['# Predictors'] = predictors
results_df_original['R-squared'] = reg_score
results_df_original['MSE % Difference'] = mse_diffs
results_df_original['Added Predictor'] = added_pred
plt.show()
results_df_original
#heatmap of pairwise correlations; second plot flags pairs where |corr| > 0.75
sns.heatmap(df.corr(),center=0);
sns.heatmap(abs(df.corr())>0.75, center=0)
#creating dataframe that removes some of the variables with high correlation to other variables
df_adj = df.copy()
to_drop = ['view', 'sqft_lot15', 'sqft_above', 'sqft_basement']
df_adj.drop(to_drop, axis=1, inplace=True)
#RFE regression of all predictors using repeated k-fold (df_adj: high-correlation columns removed, price_log target)
#setting X, y and creating train/test split
X = df_adj.drop(['price','price_log'], axis=1)
y = df_adj.price_log
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df_adj = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column
while i <= (int(len(df_adj.drop(['price','price_log'], axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scores/MSEs use the outer hold-out set, not X_val/y_val -- confirm intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df_adj['# Predictors'] = predictors
results_df_adj['R-squared'] = reg_score
results_df_adj['MSE % Difference'] = mse_diffs
results_df_adj['Added Predictor'] = added_pred
plt.show()
results_df_adj
#creating dataframe that removes the engineered binary variables (correlated with their source columns)
df_adj2 = df.copy()
to_drop = ['viewed', 'basement']
df_adj2.drop(to_drop, axis=1, inplace=True)
#RFE regression of all predictors using repeated k-fold (df_adj2, price_log target)
#setting X, y and creating train/test split
X = df_adj2.drop(['price', 'price_log'], axis=1)
y = df_adj2.price_log
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df_adj2 = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column
while i <= (int(len(df_adj2.drop(['price', 'price_log'], axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scores/MSEs use the outer hold-out set, not X_val/y_val -- confirm intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df_adj2['# Predictors'] = predictors
results_df_adj2['R-squared'] = reg_score
results_df_adj2['MSE % Difference'] = mse_diffs
results_df_adj2['Added Predictor'] = added_pred
plt.show()
results_df_adj2
#price distribution stats, used to pick the subset cutoffs below
df.price.describe()
df.price.hist()
#subset of homes priced at or below 615,000
df_s = df[df.price <= 615000]
#RFE regression of all predictors using repeated k-fold (df_s subset, price_log target)
#setting X, y and creating train/test split
X = df_s.drop(['price', 'price_log'], axis=1)
y = df_s.price_log
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df_s = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column
while i <= (int(len(df_s.drop(['price', 'price_log'], axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scores/MSEs use the outer hold-out set, not X_val/y_val -- confirm intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df_s['# Predictors'] = predictors
results_df_s['R-squared'] = reg_score
results_df_s['MSE % Difference'] = mse_diffs
results_df_s['Added Predictor'] = added_pred
plt.show()
results_df_s
#subset of homes priced at or below 440,000 (roughly the median region of the price distribution)
df_s2 = df[df.price <= 440000]
#RFE regression of all predictors using repeated k-fold (df_s2 subset, price_log target)
#setting X, y and creating train/test split
X = df_s2.drop(['price', 'price_log'], axis=1)
y = df_s2.price_log
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df_s2 = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column
while i <= (int(len(df_s2.drop(['price', 'price_log'], axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scores/MSEs use the outer hold-out set, not X_val/y_val -- confirm intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df_s2['# Predictors'] = predictors
results_df_s2['R-squared'] = reg_score
results_df_s2['MSE % Difference'] = mse_diffs
results_df_s2['Added Predictor'] = added_pred
plt.show()
results_df_s2
#complementary subset of homes priced at or above 440,000
df_s3 = df[df.price >= 440000]
#RFE regression of all predictors using repeated k-fold (df_s3 subset, price_log target)
#setting X, y and creating train/test split
X = df_s3.drop(['price', 'price_log'], axis=1)
y = df_s3.price_log
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)
#initializing counter variable 'i' and creating lists to add to over each iteration
i = 1
results_df_s3 = pd.DataFrame()
predictors = list()
reg_score = list()
mse_diffs = list()
added_pred = list()
previous_columns = []
#this loop repeats once for each predictor column
while i <= (int(len(df_s3.drop(['price', 'price_log'], axis=1).columns))):
    rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=47)
    scores = 0
    percent_diffs = 0
    #this loop uses a repeated k-fold to generate an average R-squared and % difference in train/test MSE
    for train_index, test_index in rkf.split(X_train):
        X_train2, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train2, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        linreg = LinearRegression()
        selector = RFE(linreg, n_features_to_select = i)
        selector = selector.fit(X_train2, y_train2)
        selected_columns = X_train2.columns[selector.support_ ]
        linreg.fit(X_train2[selected_columns],y_train2)
        # NOTE(review): scores/MSEs use the outer hold-out set, not X_val/y_val -- confirm intended
        scores = scores + linreg.score(X_test[selected_columns], y_test)
        y_hat_train = linreg.predict(X_train[selected_columns])
        y_hat_test = linreg.predict(X_test[selected_columns])
        train_mse = mean_squared_error(y_train, y_hat_train)
        test_mse = mean_squared_error(y_test, y_hat_test)
        mse_diff = test_mse - train_mse
        percent_diff = mse_diff/train_mse
        percent_diffs += percent_diff
    #adding data to lists (divisor 30 = 3 splits x 10 repeats)
    predictors.append(i)
    reg_score.append(scores/30)
    mse_diffs.append(percent_diffs/30*100)
    added_pred.append(list(set(selected_columns) - set(previous_columns)))
    previous_columns = selected_columns
    predicted = linreg.predict(X_test[selected_columns])
    #plot of predicted values vs actual values
    fig, ax = plt.subplots(figsize=(5,5))
    #ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
    ax.plot(y_test, y_test, 'k--', lw=4, color='y')
    sns.regplot(x=y_test, y=predicted, ax=ax, line_kws={"color": "red"})
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    ax.set_title(str(i) + " Predictors")
    ax.legend()
    i += 1
results_df_s3['# Predictors'] = predictors
results_df_s3['R-squared'] = reg_score
results_df_s3['MSE % Difference'] = mse_diffs
results_df_s3['Added Predictor'] = added_pred
plt.show()
results_df_s3
#creates a dataframe of both log-transformed and back-transformed coefficients of predictors in the model
X = df.drop(['price', 'price_log'], axis=1)
y = df.price_log
model_reg = LinearRegression()
#fit the freshly created model (previously the leftover `linreg` from the RFE loops was fitted by mistake)
model_reg.fit(X, y)
model_coef = pd.DataFrame()
preds = list()
coef_log = list()
coef = list()
sum_coef = 0
percent_coef = list()
for i, col in enumerate(X.columns):
    preds.append(col)
    coef_log.append(model_reg.coef_[i])
    #price_log was created with np.log (natural log), so back-transform with e**coef, not 10**coef
    coef.append(np.exp(model_reg.coef_[i]))
    sum_coef += np.exp(model_reg.coef_[i])
model_coef['Predictor'] = preds
model_coef['Coefficient(log)'] = coef_log
model_coef['Coefficient'] = coef
#express each back-transformed coefficient as a percentage of the total
for c in model_coef.Coefficient:
    percent_coef.append(c/sum_coef*100)
model_coef['Contribution(%)'] = percent_coef
model_coef.sort_values(by=['Coefficient'], ascending=False)
#final statsmodels OLS fit for coefficient p-values and a full summary
outcome = 'price'  # NOTE(review): earlier sklearn runs target price_log -- confirm raw price is intended here
predictors = df.drop(['price', 'price_log'], axis=1)
#build an R-style formula: price ~ pred1 + pred2 + ...
pred_sum = "+".join(predictors.columns)
formula = outcome + "~" + pred_sum
model = ols(formula=formula, data=df).fit()
model.summary()